AllLife Bank is a US bank that has a growing customer base. The majority of these customers are liability customers (depositors) with varying sizes of deposits. The number of customers who are also borrowers (asset customers) is quite small, and the bank is interested in expanding this base rapidly to bring in more loan business and in the process, earn more through the interest on loans. In particular, the management wants to explore ways of converting its liability customers to personal loan customers (while retaining them as depositors).
A campaign the bank ran last year for liability customers achieved a healthy conversion rate of over 9%. This has encouraged the retail marketing department to devise campaigns with better-targeted marketing to increase the success ratio.
As a data scientist at AllLife Bank, you have to build a model that will help the marketing department identify potential customers with a higher probability of purchasing the loan.
# Loading nb_black, which auto-formats the notebook's Python code (good coding practice)
%load_ext nb_black
# Library to suppress warnings or deprecation notes
import warnings
warnings.filterwarnings("ignore")
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np
# Library to split data
from sklearn.model_selection import train_test_split
# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
# Library to build Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
# Libraries to build decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# To tune different models
from sklearn.model_selection import GridSearchCV
# To get different metric scores
# (plot_confusion_matrix is not imported: it was removed from sklearn in v1.2 and is not used below)
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    make_scorer,
    roc_auc_score,
    precision_recall_curve,
    roc_curve,
)
loan_df = pd.read_csv("Loan_Modelling.csv")
# Copying the data to another variable to avoid changes to the original data
data = loan_df.copy()
data.head() # first 5 rows of the dataset
data.tail() # last 5 rows of the dataset
# Moving the target column (Personal_Loan) to the end of the dataframe
p_loan = data["Personal_Loan"]
data.drop(["Personal_Loan"], axis=1, inplace=True)
data["Personal_Loan"] = p_loan
data.head(5)
data.shape
data.info()
data.duplicated().sum()  # number of duplicated rows
data.isnull().sum()
data.describe().T
# Checking for negative Experience values
neg_experience = data["Experience"] < 0
neg_experience.value_counts()
# Treating negative Experience values as sign errors and converting them to positive
data["Experience"] = data["Experience"].apply(abs)
# Re-checking: no negative values should remain
data[data["Experience"] < 0]["Experience"].value_counts()
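Taking the absolute value assumes the negative signs are data-entry errors. An alternative sketch (an assumption, not applied here) would impute the negatives with the median of the non-negative values instead:
# Hypothetical alternative: impute negative Experience values with the non-negative median
# median_exp = loan_df.loc[loan_df["Experience"] >= 0, "Experience"].median()
# data.loc[data["Experience"] < 0, "Experience"] = median_exp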
from uszipcode import SearchEngine

search = SearchEngine()
# Mapping each ZIP code to its county; "NoMatch" where the lookup fails
county = []
for z in data["ZIPCode"]:
    zipcode = search.by_zipcode(z)
    county.append(zipcode.county if zipcode is not None else "NoMatch")
data["County"] = county
data.head()
cat_columns = [
"Family",
"Education",
"Securities_Account",
"CD_Account",
"Online",
"CreditCard",
"Personal_Loan",
"County",
]
for i in cat_columns:
print(data[i].value_counts(normalize=True))
print("-" * 50)
Superior_California = [
"Butte County",
"El Dorado County",
"Yolo County",
"Sacramento County",
"Placer County",
"Shasta County",
"Siskiyou County",
]
North_Coast = [
"Humboldt County",
"Lake County",
"Napa County",
"Trinity County",
"Mendocino County",
"Sonoma County",
]
SanFrancisco = [
"Alameda County",
"Contra Costa County",
"Marin County",
"San Francisco County",
"San Mateo County",
"Santa Clara County",
"Solano County",
]
Northern_San_Joaquin = [
"Alpine County",
"Amador County",
"Calaveras County",
"Madera County",
"Mariposa County",
"Merced County",
"Mono County",
"San Joaquin County",
"Stanislaus County",
"Tuolumne County",
]
Central_Coast = [
"Monterey County",
"San Benito County",
"San Luis Obispo County",
"Santa Barbara County",
"Santa Cruz County",
"Ventura County",
]
Southern_San_Joaquin = [
"Fresno County",
"Inyo County",
"Kern County",
"Kings County",
"Tulare County",
]
Inland_Empire = ["Riverside County", "San Bernardino County"]
Los_Angeles = ["Los Angeles County"]
Orange = ["Orange County"]
SanDiego = ["San Diego County", "Imperial County"]
other = ["NoMatch"]
def region_combining(x):
if x in Superior_California:
return "Superior_California"
elif x in North_Coast:
return "North_Coast"
elif x in SanFrancisco:
return "SanFrancisco"
elif x in Northern_San_Joaquin:
return "Northern_San_Joaquin"
elif x in Central_Coast:
return "Central_Coast"
elif x in Southern_San_Joaquin:
return "Southern_San_Joaquin"
elif x in Inland_Empire:
return "Inland_Empire"
elif x in Los_Angeles:
return "Los_Angeles"
elif x in Orange:
return "Orange"
elif x in SanDiego:
return "SanDiego"
elif x in other:
return "other"
else:
return x
data["region"] = data["County"].apply(region_combining)
print("Distinct values in Region column:", data.region.unique())
data.region.value_counts(normalize=True)
def histogram_boxplot(data, feature, figsize=(15, 10), kde=False, bins=None):
"""
Boxplot and histogram combined
data: dataframe
feature: dataframe column
figsize: size of figure (default (15,10))
kde: whether to show the density curve (default False)
bins: number of bins for histogram (default None)
"""
f2, (ax_box2, ax_hist2) = plt.subplots(
nrows=2, # Number of rows of the subplot grid= 2
sharex=True, # x-axis will be shared among all subplots
gridspec_kw={"height_ratios": (0.25, 0.75)},
figsize=figsize,
) # creating the 2 subplots
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )  # boxplot; the star marks the mean of the column
    if bins:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins)  # histogram
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2)
ax_hist2.axvline(
data[feature].mean(), color="green", linestyle="--"
) # Add mean to the histogram
ax_hist2.axvline(
data[feature].median(), color="black", linestyle="-"
) # Add median to the histogram
histogram_boxplot(data, "Age")
histogram_boxplot(data, "Experience")
histogram_boxplot(data, "Income")
histogram_boxplot(data, "CCAvg")
histogram_boxplot(data, "Mortgage")
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
"""
Barplot with percentage at the top
data: dataframe
feature: dataframe column
perc: whether to display percentages instead of count (default is False)
n: displays the top n category levels (default is None, i.e., display all levels)
"""
total = len(data[feature]) # length of the column
count = data[feature].nunique()
if n is None:
plt.figure(figsize=(count + 1, 5))
else:
plt.figure(figsize=(n + 1, 5))
plt.xticks(rotation=90, fontsize=15)
ax = sns.countplot(
data=data,
x=feature,
palette="Paired",
order=data[feature].value_counts().index[:n].sort_values(),
)
for p in ax.patches:
        if perc:
label = "{:.1f}%".format(
100 * p.get_height() / total
) # percentage of each class of the category
else:
label = p.get_height() # count of each level of the category
        x = p.get_x() + p.get_width() / 2  # horizontal center of the bar
        y = p.get_height()  # height of the bar
ax.annotate(
label,
(x, y),
ha="center",
va="center",
size=12,
xytext=(0, 5),
textcoords="offset points",
) # annotate the percentage
plt.show() # show the plot
labeled_barplot(data, "Family", perc=True)
labeled_barplot(data, "Securities_Account", perc=True)
labeled_barplot(data, "Personal_Loan", perc=True)
labeled_barplot(data, "CD_Account", perc=True)
labeled_barplot(data, "Online", perc=True)
labeled_barplot(data, "CreditCard", perc=True)
labeled_barplot(data, "region", perc=True)
plt.figure(figsize=(15, 7))
sns.heatmap(
    data.corr(numeric_only=True), annot=True, vmin=-1, vmax=1, cmap="Spectral"
)  # numeric_only restricts the correlation to numeric columns (County and region are strings)
plt.show()
# pairplot creates its own figure, so no plt.figure call is needed
sns.pairplot(data=data, hue="Personal_Loan")
plt.show()
plt.figure(figsize=(15, 7))
sns.scatterplot(x="CCAvg", y="Income", hue="Personal_Loan", data=data)
plt.figure(figsize=(15, 7))
sns.scatterplot(x="Mortgage", y="Income", hue="Personal_Loan", data=data)
cols = data[
[
"Income",
"CCAvg",
"Mortgage",
"Experience",
"Age",
]
].columns.tolist()
plt.figure(figsize=(12, 10))
for i, variable in enumerate(cols):
plt.subplot(4, 2, i + 1)
    sns.boxplot(
        data=data,
        x="Education",
        y=variable,
        hue="Personal_Loan",
        palette="PuBu",
        showfliers=False,  # turning off outliers
    )
plt.tight_layout()
plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
plt.title(variable)
plt.show()
# function to plot stacked bar chart
def stacked_barplot(data, predictor, target):
"""
Print the category counts and plot a stacked bar chart
data: dataframe
predictor: independent variable
target: target variable
"""
count = data[predictor].nunique()
sorter = data[target].value_counts().index[-1]
tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
by=sorter, ascending=False
)
print(tab1)
print("-" * 120)
tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
by=sorter, ascending=False
)
tab.plot(kind="bar", stacked=True, figsize=(count + 5, 6))
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
plt.show()
stacked_barplot(data, "Family", "Personal_Loan")
stacked_barplot(data, "Securities_Account", "Personal_Loan")
stacked_barplot(data, "Online", "Personal_Loan")
stacked_barplot(data, "region", "Personal_Loan")
stacked_barplot(data, "CreditCard", "Personal_Loan")
stacked_barplot(data, "CD_Account", "Personal_Loan")
## Dropping ID, ZIPCode and County (identifiers) and Experience (almost perfectly correlated with Age)
data.drop(["ID", "ZIPCode", "County", "Experience"], axis=1, inplace=True)
numerical_col = data.select_dtypes(include=np.number).columns.tolist()
plt.figure(figsize=(20, 30))
for i, variable in enumerate(numerical_col):
plt.subplot(5, 4, i + 1)
plt.boxplot(data[variable], whis=1.5)
plt.tight_layout()
plt.title(variable)
plt.show()
# Treating outliers in Income, CCAvg and Mortgage by capping:
# values outside the 25th-75th percentile range are clipped to those percentiles
p25 = data[["Income", "CCAvg", "Mortgage"]].quantile(0.25)  # 25th percentile (Q1)
p75 = data[["Income", "CCAvg", "Mortgage"]].quantile(0.75)  # 75th percentile (Q3)
data[["Income", "CCAvg", "Mortgage"]] = data[["Income", "CCAvg", "Mortgage"]].clip(
    p25, p75, axis=1
)  # outlier clipping
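Note that clipping at Q1 and Q3 alters half of the observations, not just the extremes; the more conventional flooring/capping rule clips at the whisker bounds Q1 - 1.5*IQR and Q3 + 1.5*IQR. A sketch of that alternative (not applied here):
# Hypothetical alternative: clip at the IQR whiskers rather than at Q1/Q3
# iqr = p75 - p25
# data[["Income", "CCAvg", "Mortgage"]] = data[["Income", "CCAvg", "Mortgage"]].clip(
#     p25 - 1.5 * iqr, p75 + 1.5 * iqr, axis=1
# )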
# let's look at box plot to see if outliers have been treated or not
plt.figure(figsize=(20, 30))
for i, variable in enumerate(numerical_col):
plt.subplot(5, 4, i + 1)
plt.boxplot(data[variable], whis=1.5)
plt.tight_layout()
plt.title(variable)
plt.show()
X = data.drop(["Personal_Loan"], axis=1)
Y = data["Personal_Loan"]
X = pd.get_dummies(X, drop_first=True)
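drop_first=True drops one dummy level per categorical variable to avoid perfect multicollinearity in the logistic regression; here the region column becomes a set of region_<name> indicator columns. To inspect the encoded feature set:
print(X.columns.tolist())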
# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1
)
print(X_train.shape, X_test.shape)
print("Number of rows in train data =", X_train.shape[0])
print("Number of rows in test data =", X_test.shape[0])
print("Percentage of classes in training set:")
print(y_train.value_counts(normalize=True))
print("Percentage of classes in test set:")
print(y_test.value_counts(normalize=True))
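With only about 9-10% positives, a stratified split keeps the class proportions identical across train and test; a sketch of that alternative (the analysis above uses a plain random split):
# Hypothetical alternative: stratified train-test split
# X_train, X_test, y_train, y_test = train_test_split(
#     X, Y, test_size=0.30, random_state=1, stratify=Y
# )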
Recall should be maximized: the higher the recall, the lower the number of false negatives, i.e., the fewer likely loan customers the model misses.

def model_performance_classification_sklearn_with_threshold(
    model, predictors, target, threshold=0.5
):
"""
Function to compute different metrics, based on the threshold specified, to check classification model performance
model: classifier
predictors: independent variables
target: dependent variable
threshold: threshold for classifying the observation as class 1
"""
    # predicting probabilities for class 1 and applying the threshold
    pred_prob = model.predict_proba(predictors)[:, 1]
    pred = (pred_prob > threshold).astype(int)
acc = accuracy_score(target, pred) # to compute Accuracy
recall = recall_score(target, pred) # to compute Recall
precision = precision_score(target, pred) # to compute Precision
f1 = f1_score(target, pred) # to compute F1-score
# creating a dataframe of metrics
df_perf = pd.DataFrame(
{
"Accuracy": acc,
"Recall": recall,
"Precision": precision,
"F1": f1,
},
index=[0],
)
return df_perf
# defining a function to plot the confusion_matrix of a classification model built using sklearn
def confusion_matrix_sklearn_with_threshold(model, predictors, target, threshold=0.5):
"""
To plot the confusion_matrix, based on the threshold specified, with percentages
model: classifier
predictors: independent variables
target: dependent variable
threshold: threshold for classifying the observation as class 1
"""
    pred_prob = model.predict_proba(predictors)[:, 1]
    y_pred = (pred_prob > threshold).astype(int)
cm = confusion_matrix(target, y_pred)
labels = np.asarray(
[
["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / cm.flatten().sum())]
for item in cm.flatten()
]
).reshape(2, 2)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=labels, fmt="")
plt.ylabel("True label")
plt.xlabel("Predicted label")
# Several solvers are available in sklearn's LogisticRegression;
# newton-cg supports the default L2 penalty and multinomial problems
lg = LogisticRegression(solver="newton-cg", random_state=1)
model = lg.fit(X_train, y_train)
coef_df = pd.DataFrame(
np.append(lg.coef_, lg.intercept_),
index=X_train.columns.tolist() + ["Intercept"],
columns=["Coefficients"],
)
coef_df.T
# converting coefficients to odds
odds = np.exp(lg.coef_[0])
# finding the percentage change
perc_change_odds = (np.exp(lg.coef_[0]) - 1) * 100
# removing limit from number of columns to display
pd.set_option("display.max_columns", None)
# adding the odds to a dataframe
pd.DataFrame({"Odds": odds, "Change_odd%": perc_change_odds}, index=X_train.columns).T
Holding all other features constant:
Age: a 1-unit increase multiplies the odds of accepting a loan by about 1.0, a 0.43% increase in odds.
Income: a 1-unit increase multiplies the odds by about 1.3, a 28.8% increase in odds.
Family: a 1-unit increase multiplies the odds by about 1.8, an 84.8% increase in odds.
CCAvg: a 1-unit increase multiplies the odds by about 2.5, a 154% increase in odds.
Education: a 1-unit increase multiplies the odds by about 3.8, a 278% increase in odds.
Mortgage: a 1-unit increase multiplies the odds by about 1.0, a 0.15% increase in odds.
Securities_Account: holding a securities account multiplies the odds by about 0.35, a 64% decrease in odds.
CD_Account: holding a CD account multiplies the odds by about 23.8, a 2283% increase in odds.
Online: using online banking multiplies the odds by about 0.65, a 34% decrease in odds.
CreditCard: holding a credit card multiplies the odds by about 0.32, a 68% decrease in odds.
The region dummies follow the same interpretation, read off their Odds and Change_odd% values.
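As a worked check of this interpretation: a logistic regression coefficient beta maps to an odds multiplier exp(beta), so the percentage change in odds is (exp(beta) - 1) * 100. For a hypothetical coefficient of 0.25:
# Worked example with a hypothetical coefficient (illustration only)
beta = 0.25
odds_multiplier = np.exp(beta)  # ~1.28: odds multiplied by 1.28 per unit increase
pct_change = (odds_multiplier - 1) * 100  # ~28.4% increase in odds
print(odds_multiplier, pct_change)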
# creating confusion matrix
confusion_matrix_sklearn_with_threshold(lg, X_train, y_train)
log_reg_model_train_perf = model_performance_classification_sklearn_with_threshold(
lg, X_train, y_train
)
print("Training performance:")
log_reg_model_train_perf
# creating confusion matrix
confusion_matrix_sklearn_with_threshold(lg, X_test, y_test)
log_reg_model_test_perf = model_performance_classification_sklearn_with_threshold(
lg, X_test, y_test
)
print("Test set performance:")
log_reg_model_test_perf
logit_roc_auc_train = roc_auc_score(y_train, lg.predict_proba(X_train)[:, 1])
fpr, tpr, thresholds = roc_curve(y_train, lg.predict_proba(X_train)[:, 1])
plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, label="Logistic Regression (area = %0.2f)" % logit_roc_auc_train)
plt.plot([0, 1], [0, 1], "r--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.show()
logit_roc_auc_test = roc_auc_score(y_test, lg.predict_proba(X_test)[:, 1])
fpr, tpr, thresholds = roc_curve(y_test, lg.predict_proba(X_test)[:, 1])
plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, label="Logistic Regression (area = %0.2f)" % logit_roc_auc_test)
plt.plot([0, 1], [0, 1], "r--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.show()
# Optimal threshold as per the AUC-ROC curve
# The optimal cut-off maximizes TPR - FPR (Youden's J statistic): high TPR with low FPR
fpr, tpr, thresholds = roc_curve(y_train, lg.predict_proba(X_train)[:, 1])
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold_auc_roc = thresholds[optimal_idx]
print(optimal_threshold_auc_roc)
# creating confusion matrix
confusion_matrix_sklearn_with_threshold(
lg, X_train, y_train, threshold=optimal_threshold_auc_roc
)
# checking model performance for this model
log_reg_model_train_perf_threshold_auc_roc = (
model_performance_classification_sklearn_with_threshold(
lg, X_train, y_train, threshold=optimal_threshold_auc_roc
)
)
print("Training performance:")
log_reg_model_train_perf_threshold_auc_roc
# creating confusion matrix
confusion_matrix_sklearn_with_threshold(
lg, X_test, y_test, threshold=optimal_threshold_auc_roc
)
# checking model performance for this model
log_reg_model_test_perf_threshold_auc_roc = (
model_performance_classification_sklearn_with_threshold(
lg, X_test, y_test, threshold=optimal_threshold_auc_roc
)
)
print("Test set performance:")
log_reg_model_test_perf_threshold_auc_roc
y_scores = lg.predict_proba(X_train)[:, 1]
prec, rec, tre = precision_recall_curve(
y_train,
y_scores,
)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
plt.plot(thresholds, precisions[:-1], "b--", label="precision")
plt.plot(thresholds, recalls[:-1], "g--", label="recall")
plt.xlabel("Threshold")
plt.legend(loc="upper left")
plt.ylim([0, 1])
plt.figure(figsize=(10, 7))
plot_precision_recall_vs_threshold(prec, rec, tre)
plt.show()
# setting the threshold
optimal_threshold_curve = 0.38
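The 0.38 value is read off the curve above, roughly where the precision and recall curves cross. A sketch that picks that crossover programmatically (an alternative, not what was done here):
# Hypothetical alternative: threshold where precision and recall are closest
crossover_idx = np.argmin(np.abs(prec[:-1] - rec[:-1]))
print(tre[crossover_idx])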
# creating confusion matrix
confusion_matrix_sklearn_with_threshold(
lg, X_train, y_train, threshold=optimal_threshold_curve
)
log_reg_model_train_perf_threshold_curve = (
model_performance_classification_sklearn_with_threshold(
lg, X_train, y_train, threshold=optimal_threshold_curve
)
)
print("Training performance:")
log_reg_model_train_perf_threshold_curve
# creating confusion matrix
confusion_matrix_sklearn_with_threshold(
lg, X_test, y_test, threshold=optimal_threshold_curve
)
log_reg_model_test_perf_threshold_curve = (
model_performance_classification_sklearn_with_threshold(
lg, X_test, y_test, threshold=optimal_threshold_curve
)
)
print("Test set performance:")
log_reg_model_test_perf_threshold_curve
# training performance comparison
models_train_comp_df = pd.concat(
[
log_reg_model_train_perf.T,
log_reg_model_train_perf_threshold_auc_roc.T,
log_reg_model_train_perf_threshold_curve.T,
],
axis=1,
)
models_train_comp_df.columns = [
"Logistic Regression sklearn",
"Logistic Regression-0.14 Threshold",
"Logistic Regression-0.38 Threshold",
]
print("Training performance comparison:")
models_train_comp_df
# testing performance comparison
models_test_comp_df = pd.concat(
[
log_reg_model_test_perf.T,
log_reg_model_test_perf_threshold_auc_roc.T,
log_reg_model_test_perf_threshold_curve.T,
],
axis=1,
)
models_test_comp_df.columns = [
"Logistic Regression sklearn",
"Logistic Regression-0.14 Threshold",
"Logistic Regression-0.38 Threshold",
]
print("Test set performance comparison:")
models_test_comp_df
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
"""
Function to compute different metrics to check classification model performance
model: classifier
predictors: independent variables
target: dependent variable
"""
# predicting using the independent variables
pred = model.predict(predictors)
acc = accuracy_score(target, pred) # to compute Accuracy
recall = recall_score(target, pred) # to compute Recall
precision = precision_score(target, pred) # to compute Precision
f1 = f1_score(target, pred) # to compute F1-score
# creating a dataframe of metrics
df_perf = pd.DataFrame(
{
"Accuracy": acc,
"Recall": recall,
"Precision": precision,
"F1": f1,
},
index=[0],
)
return df_perf
def confusion_matrix_sklearn(model, predictors, target):
"""
To plot the confusion_matrix with percentages
model: classifier
predictors: independent variables
target: dependent variable
"""
y_pred = model.predict(predictors)
cm = confusion_matrix(target, y_pred)
labels = np.asarray(
[
["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / cm.flatten().sum())]
for item in cm.flatten()
]
).reshape(2, 2)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=labels, fmt="")
plt.ylabel("True label")
plt.xlabel("Predicted label")
model = DecisionTreeClassifier(
criterion="gini", class_weight={0: 0.10, 1: 0.90}, random_state=1
)
model.fit(X_train, y_train)
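The 10/90 weights up-weight the rare positive class by hand; sklearn can also infer weights inversely proportional to class frequencies. A sketch of that alternative (not used for the results below):
# Hypothetical alternative: let sklearn derive the class weights
# model_balanced = DecisionTreeClassifier(class_weight="balanced", random_state=1)
# model_balanced.fit(X_train, y_train)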
confusion_matrix_sklearn(model, X_train, y_train)
decision_tree_perf_train = model_performance_classification_sklearn(
model, X_train, y_train
)
decision_tree_perf_train
confusion_matrix_sklearn(model, X_test, y_test)
decision_tree_perf_test = model_performance_classification_sklearn(
model, X_test, y_test
)
decision_tree_perf_test
feature_names = list(X.columns)
print(feature_names)
plt.figure(figsize=(20, 30))
out = tree.plot_tree(
model,
feature_names=feature_names,
filled=True,
fontsize=9,
node_ids=True,
class_names=True,
)
for o in out:
arrow = o.arrow_patch
if arrow is not None:
arrow.set_edgecolor("black")
arrow.set_linewidth(1)
plt.show()
# Text report showing the rules of a decision tree -
print(tree.export_text(model, feature_names=feature_names, show_weights=True))
importances = model.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# Choose the type of classifier.
estimator = DecisionTreeClassifier(random_state=1)
# Grid of parameters to choose from
parameters = {
"max_depth": [5, 10, 15, None],
"criterion": ["entropy", "gini"],
"splitter": ["best", "random"],
"min_impurity_decrease": [0.00001, 0.0001, 0.01],
}
# Scoring used to compare parameter combinations: recall, since false negatives are the costly errors here
recall_scorer = make_scorer(recall_score)
# Run the grid search
grid_obj = GridSearchCV(estimator, parameters, scoring=recall_scorer, cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
estimator.fit(X_train, y_train)
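To see what the search selected, GridSearchCV exposes the chosen parameters and the cross-validated score:
print(grid_obj.best_params_)  # best hyperparameter combination
print(grid_obj.best_score_)  # mean cross-validated recall for that combination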
decision_tree_tune_perf_train = model_performance_classification_sklearn(
estimator, X_train, y_train
)
decision_tree_tune_perf_train
confusion_matrix_sklearn(estimator, X_train, y_train)
decision_tree_tune_perf_test = model_performance_classification_sklearn(
estimator, X_test, y_test
)
decision_tree_tune_perf_test
## creating a list of column names
feature_names = X_train.columns.to_list()
plt.figure(figsize=(15, 10))
out = tree.plot_tree(
estimator,
feature_names=feature_names,
filled=True,
fontsize=9,
node_ids=False,
class_names=None,
)
for o in out:
arrow = o.arrow_patch
if arrow is not None:
arrow.set_edgecolor("black")
arrow.set_linewidth(1)
plt.show()
# Importance of a feature: the (normalized) total reduction of the criterion
# brought by that feature, also known as the Gini importance
print(
pd.DataFrame(
estimator.feature_importances_, columns=["Imp"], index=X_train.columns
).sort_values(by="Imp", ascending=False)
)
# After tuning, the tree uses fewer features, so the importance of those that remain increases
importances = estimator.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
clf = DecisionTreeClassifier(random_state=1)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
pd.DataFrame(path)
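For reference, minimal cost-complexity pruning chooses the subtree T minimizing R_alpha(T) = R(T) + alpha * |leaves(T)|, where R(T) is the total impurity of the leaves; as alpha grows, impurity is traded for simplicity and larger subtrees are pruned away.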
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
plt.show()
clfs = []
for ccp_alpha in ccp_alphas:
clf = DecisionTreeClassifier(random_state=1, ccp_alpha=ccp_alpha)
clf.fit(X_train, y_train)
clfs.append(clf)
print(
"Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
clfs[-1].tree_.node_count, ccp_alphas[-1]
)
)
We remove the last element in clfs and ccp_alphas because it is the trivial tree with only one node. The plots below show that the number of nodes and the tree depth decrease as alpha increases.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
fig, ax = plt.subplots(2, 1, figsize=(10, 7))
ax[0].plot(ccp_alphas, node_counts, marker="o", drawstyle="steps-post")
ax[0].set_xlabel("alpha")
ax[0].set_ylabel("number of nodes")
ax[0].set_title("Number of nodes vs alpha")
ax[1].plot(ccp_alphas, depth, marker="o", drawstyle="steps-post")
ax[1].set_xlabel("alpha")
ax[1].set_ylabel("depth of tree")
ax[1].set_title("Depth vs alpha")
fig.tight_layout()
recall_train = []
for clf in clfs:
pred_train = clf.predict(X_train)
values_train = recall_score(y_train, pred_train)
recall_train.append(values_train)
recall_test = []
for clf in clfs:
pred_test = clf.predict(X_test)
values_test = recall_score(y_test, pred_test)
recall_test.append(values_test)
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("alpha")
ax.set_ylabel("Recall")
ax.set_title("Recall vs alpha for training and testing sets")
ax.plot(ccp_alphas, recall_train, marker="o", label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, recall_test, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()
# selecting the tree with the highest test recall
index_best_model = np.argmax(recall_test)
best_model = clfs[index_best_model]
print(best_model)
decision_tree_postpruned_perf_train = model_performance_classification_sklearn(
best_model, X_train, y_train
)
decision_tree_postpruned_perf_train
confusion_matrix_sklearn(best_model, X_train, y_train)
decision_tree_postpruned_perf_test = model_performance_classification_sklearn(
best_model, X_test, y_test
)
decision_tree_postpruned_perf_test
plt.figure(figsize=(10, 10))
out = tree.plot_tree(
best_model,
feature_names=feature_names,
filled=True,
fontsize=9,
node_ids=True,
class_names=True,
)
for o in out:
arrow = o.arrow_patch
if arrow is not None:
arrow.set_edgecolor("black")
arrow.set_linewidth(1)
plt.show()
# Text report showing the rules of a decision tree -
print(tree.export_text(best_model, feature_names=feature_names, show_weights=True))
# Importance of a feature: the (normalized) total reduction of the criterion
# brought by that feature, also known as the Gini importance
print(
pd.DataFrame(
best_model.feature_importances_, columns=["Imp"], index=X_train.columns
).sort_values(by="Imp", ascending=False)
)
importances = best_model.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# training performance comparison
models_train_comp_df = pd.concat(
[
decision_tree_perf_train.T,
decision_tree_tune_perf_train.T,
decision_tree_postpruned_perf_train.T,
],
axis=1,
)
models_train_comp_df.columns = [
"Decision Tree sklearn",
"Decision Tree (Pre-Pruning)",
"Decision Tree (Post-Pruning)",
]
print("Training performance comparison:")
models_train_comp_df
# test performance comparison
models_test_comp_df = pd.concat(
    [
        decision_tree_perf_test.T,
        decision_tree_tune_perf_test.T,
        decision_tree_postpruned_perf_test.T,
    ],
    axis=1,
)
models_test_comp_df.columns = [
    "Decision Tree sklearn",
    "Decision Tree (Pre-Pruning)",
    "Decision Tree (Post-Pruning)",
]
print("Test set performance comparison:")
models_test_comp_df
# training performance comparison
models_train_comp_df = pd.concat(
[
log_reg_model_train_perf.T,
log_reg_model_train_perf_threshold_auc_roc.T,
log_reg_model_train_perf_threshold_curve.T,
decision_tree_perf_train.T,
decision_tree_tune_perf_train.T,
decision_tree_postpruned_perf_train.T,
],
axis=1,
)
models_train_comp_df.columns = [
"Logistic Regression sklearn",
"Logistic Regression-0.14 Threshold",
"Logistic Regression-0.38 Threshold",
"Decision Tree sklearn",
"Decision Tree (Pre-Pruning)",
"Decision Tree (Post-Pruning)",
]
print("Training performance comparison:")
models_train_comp_df
# testing performance comparison
models_test_comp_df = pd.concat(
[
log_reg_model_test_perf.T,
log_reg_model_test_perf_threshold_auc_roc.T,
log_reg_model_test_perf_threshold_curve.T,
decision_tree_perf_test.T,
decision_tree_tune_perf_test.T,
decision_tree_postpruned_perf_test.T,
],
axis=1,
)
models_test_comp_df.columns = [
"Logistic Regression sklearn",
"Logistic Regression-0.14 Threshold",
"Logistic Regression-0.38 Threshold",
"Decision Tree sklearn",
"Decision Tree (Pre-Pruning)",
"Decision Tree (Post-Pruning)",
]
print("Testing performance comparison:")
models_test_comp_df